The purpose of this file is to practice applying basic R commands learned through lecture videos and perform exploratory analysis on worldwide COVID-19 data.
library(dplyr)
library(lubridate)
library(knitr)
library(ggplot2)
library(cowplot)Data was read in and summarized as shown below: (current file is as of 5/31)
owid <- read.csv(url("https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv"), na.strings = c("", "NA"), colClasses = c("date" = "Date"), header = TRUE)
dim(owid)## [1] 92885 59
head(owid)
str(owid)
summary(owid)Data set, including possible confounders for use in later analysis, is cleaned and filtered below.
owid.filtered <- owid %>%
rename(iso.code = iso_code, totcases = total_cases, totdeaths = total_deaths, totcases.mil = total_cases_per_million, totdeaths.mil = total_deaths_per_million, hosp.patients = hosp_patients, hosp.patients.mil = hosp_patients_per_million, pop.density = population_density, med.age = median_age, older70 = aged_70_older, gdp.cap = gdp_per_capita, card.death = cardiovasc_death_rate, diab.prev = diabetes_prevalence, fem.smokers = female_smokers, male.smokers = male_smokers) %>%
select(c(iso.code:totcases, totdeaths, totcases.mil, totdeaths.mil, hosp.patients, hosp.patients.mil, population, pop.density, med.age, older70, gdp.cap, card.death, diab.prev, fem.smokers, male.smokers)) %>% #Renamed variables and filtered to keep only those of interest.
filter(date == "2020-05-31") #Filtered out all but most recent date (most recent date is a running sum of total cases).
table(owid.filtered$continent, useNA = "ifany") #Data includes both continent and country-specific outcomes. Continent outcomes can be distinguished by "NA" in the continent column; the continent name is in the location column. ##
## Africa Asia Europe North America Oceania
## 54 47 46 23 4
## South America <NA>
## 12 9
More exploratory analysis of data frames:
dim(owid.filtered)
head(owid.filtered)
tail(owid.filtered)Data is filtered to remove entries for specific country locations and include only data that aggregates COVID-19 outcomes on a continent and world scale.
owid.continents <- owid.filtered %>%
filter(is.na(continent)) %>%
select(-continent) #Used missingness properties to select only continent rows and then removed continent column.
table(owid.continents$location)##
## Africa Asia Europe European Union International
## 1 1 1 1 1
## North America Oceania South America World
## 1 1 1 1
Data is filtered to include only country-specific COVID-19 outcomes and exclude continent and world-wide data.
owid.countries <- owid.filtered %>%
filter(!is.na(continent)) #Removed all rows with NA for continent, leaving behind only countries.
table(owid.countries$continent)##
## Africa Asia Europe North America Oceania
## 54 47 46 23 4
## South America
## 12
table(owid.filtered$totcases, useNA = "ifany") %>%
tail() #No missing values for total cases column. ##
## 1138778 1160536 1798793 1959812 2031973 6188259
## 1 1 1 1 1 1
table(owid.filtered$totdeaths, useNA = "ifany") %>%
tail() #Several missing values for total deaths column. Keep this in mind when analyzing. ##
## 107859 126360 127102 172683 375061 <NA>
## 1 1 1 1 1 19
table(owid.filtered$hosp.patients, useNA = "ifany") %>%
tail() #Most values are missing. Likely not useful for data analysis. ##
## 2116 2134 6822 7237 14271 <NA>
## 1 1 1 1 1 169
which(is.na(owid.filtered$totdeaths))## [1] 22 31 51 57 63 72 98 101 118 123 138 148 149 150 156 177 182 189 191
#Missing countries: Butan, Cambodia, Dominica, Eritrea, Fiji, Grenada, Laos, Lesotho, Mongolia, Namibia, Saint Kitts and Nevis, Saint Lucia, Saint Vincent and the Grenadines, Seychelles, Timor, Uganda, Vatican, Vietnam.Filtered OWID data frames further narrowed to COVID-19 cases only, deaths only, and hospitalizations only, separated by continent and by country respectively.
#By continent:
select(owid.continents, c(iso.code:totcases, totcases.mil)) #COVID-19 cases by continent.
select(owid.continents, c(iso.code:date, totdeaths, totdeaths.mil)) #COVID-19 deaths by continent.
select(owid.continents, c(iso.code:date, hosp.patients, hosp.patients.mil)) #COVID-19 hospitalizations by continent.
#By country:
select(owid.countries, c(iso.code:totcases, totcases.mil)) #COVID-19 cases by country.
select(owid.countries, c(iso.code:date, totdeaths, totdeaths.mil)) #COVID-19 deaths by country.
select(owid.countries, c(iso.code:date, hosp.patients, hosp.patients.mil)) #COVID-19 hospitalizations by country. Plots for exploratory analysis comparing COVID-19 cases by continent for total cases, total cases per million people, total deaths, and total deaths per million people.
continents1 <- owid.continents %>%
filter(!(location %in% c("International", "World"))) %>%
ggplot(aes(x = location, y = totcases)) +
geom_bar(stat="identity") +
geom_text(aes(label = totcases), vjust = -0.25, size = 3) +
labs(title = "Total COVID-19 Cases by Continent") +
labs(x = "Continent", y = "Total COVID-19 Cases") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) #Barplot of total cases by continent.
continents2 <- owid.continents %>%
filter(!(location %in% c("International", "World"))) %>%
ggplot(aes(x = location, y = totcases.mil)) +
geom_bar(stat="identity") +
geom_text(aes(label = totcases), vjust = -0.25, size = 3) +
labs(title = "Total COVID-19 Cases per Million People by Continent") +
labs(x = "Continent", y = "Total COVID-19 Cases per Million") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) #Barplot of total cases per million by continent.
plot_grid(continents1, continents2, labels = "AUTO")continents3 <- owid.continents %>%
filter(!(location %in% c("International", "World"))) %>%
ggplot(aes(x = location, y = totdeaths)) +
geom_bar(stat="identity") +
geom_text(aes(label = totcases), vjust = -0.25, size = 3) +
labs(title = "Total COVID-19 Deaths by Continent") +
labs(x = "Continent", y = "Total COVID-19 Cases") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) #Barplot of total deaths by continent.
continents4 <- owid.continents %>%
filter(!(location %in% c("International", "World"))) %>%
ggplot(aes(x = location, y = totdeaths.mil)) +
geom_bar(stat="identity") +
geom_text(aes(label = totcases), vjust = -0.25, size = 3) +
labs(title = "Total COVID-19 Death per Million by Continent") +
labs(x = "Continent", y = "Total COVID-19 Cases") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) #Barplot of total deaths per million by continent.
plot_grid(continents3, continents4, labels = "AUTO")Barplots filtering out the 10 countries with the highest COVID-19 cases per million people on each continent, as well as a heatmap of all countries, are the outputs of the code below:
ctry.mean.cases <- owid.countries %>%
group_by(continent) %>%
summarise(totcases.mil = mean(totcases.mil, na.rm = TRUE)) #Mean COVID-19 cases per country, grouped by continent.
mean.cases <- function(x) {
ctry.mean.cases %>%
filter(continent == x) %>%
select(totcases.mil) %>%
as.numeric()
} #Function to produce numeric value of mean COVID-19 cases per country by continent to be used in bar graphs below.
top.ctry.cases <- function(x) {
owid.countries %>%
filter(continent == x) %>%
arrange(desc(totcases.mil)) %>%
head(n=10) %>%
ggplot(aes(x = location, y = totcases.mil)) +
geom_bar(stat = "identity") +
geom_abline(slope=0, intercept= mean.cases(x), col = "red", lty=2) +
labs(x = "Country", y = "Total COVID-19 Cases per Million People") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x)))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(top.ctry.cases(i))
} #Loops function for continents of interest. ctry.mean.deaths <- owid.countries %>%
group_by(continent) %>%
summarise(totdeaths.mil = mean(totdeaths.mil, na.rm = TRUE)) #Mean COVID-19 daeths per country, grouped by continent.
mean.deaths <- function(x) {
ctry.mean.deaths %>%
filter(continent == x) %>%
select(totdeaths.mil) %>%
as.numeric()
} #Function to produce numeric value of mean COVID-19 deaths per country by continent to be used in bar graphs below.
top.ctry.deaths <- function(x) {
owid.countries %>%
filter(continent == x) %>%
arrange(desc(totdeaths.mil)) %>%
head(n=10) %>%
ggplot(aes(x = location, y = totdeaths.mil)) +
geom_bar(stat = "identity") +
geom_abline(slope=0, intercept= mean.deaths(x), col = "red", lty=2) +
labs(x = "Country", y = "Total COVID-19 Deaths per Million People") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = paste0("COVID-19 Deaths per Million People in ", (continent = x)))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(top.ctry.deaths(i))
} #Loops function for continents of interest. owid %>%
rename(iso.code = iso_code, totcases.mil = total_cases_per_million) %>%
select(c(iso.code:date, totcases.mil)) %>%
filter(!is.na(continent)) %>%
ggplot(aes(x = date, y = location, fill = totcases.mil)) +
geom_tile() +
labs(title = "Total COVID-19 Cases per Million Over Time", size = 22) +
scale_x_date(date_breaks = "1 month") +
scale_fill_gradient(low = "#ffeda0", high = "#f03b20") +
ggsave(file = "test.image.pdf", width = 20, height = 25) #Total cases per million over time by country as heatmap. owid %>%
rename(iso.code = iso_code, new.cases = new_cases) %>%
select(c(iso.code:date, new.cases)) %>%
filter(!is.na(continent)) %>%
ggplot(aes(x = date, y = location, fill = new.cases)) +
geom_tile() +
labs(title = "New COVID-19 Cases per Million Over Time", size = 22) +
scale_x_date(date_breaks = "1 month") +
scale_fill_gradient(low = "#ffeda0", high = "#f03b20", limits = c(0, 20000)) +
ggsave(file = "test.image2.pdf", width = 20, height = 25) #New cases per million over time by country as heatmap. Difficult to scale because range is so large. Population size and population density are evaluated below as confounding variables for COVID-19 cases. These variables are first analyzed on a full-world scale, and then by continent.
ggplot(data = owid.countries, aes(x = population, y = totcases.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "green") +
theme_bw() +
labs(x = "Population Size", y = "Total COVID-19 Cases per Million People") +
labs(title = ("World COVID-19 Cases per Million People vs. Population Size")) #COVID-19 cases and population size on world scale. Red line is smooth fit; green line is linear fit. cor(owid.countries$totcases.mil, owid.countries$population, use = "complete.obs") ## [1] -0.05455269
pop.lm.cases <- lm(totcases.mil ~ population, data = owid.countries)
pop.lm.cases##
## Call:
## lm(formula = totcases.mil ~ population, data = owid.countries)
##
## Coefficients:
## (Intercept) population
## 1.378e+03 -9.934e-07
summary.lm(pop.lm.cases) #Summary statistics for linear fit. Not a significant relationship. ##
## Call:
## lm(formula = totcases.mil ~ population, data = owid.countries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1375.3 -1273.2 -1085.4 170.6 18393.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.378e+03 2.092e+02 6.588 4.54e-10 ***
## population -9.934e-07 1.340e-06 -0.741 0.46
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2750 on 184 degrees of freedom
## Multiple R-squared: 0.002976, Adjusted R-squared: -0.002443
## F-statistic: 0.5492 on 1 and 184 DF, p-value: 0.4596
ggplot(data = owid.countries, aes(x = pop.density, y = totcases.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "green") +
theme_bw() +
labs(x = "Population Density", y = "Total COVID-19 Cases per Million People") +
labs(title = ("World COVID-19 Cases per Million People vs. Population Density")) #COVID-19 cases and population size on world scale. Red line is smooth fit; green line is linear fit. cor(owid.countries$totcases.mil, owid.countries$pop.density, use = "complete.obs") ## [1] 0.09592666
pop.den.lm.cases <- lm(totcases.mil ~ pop.density, data = owid.countries)
pop.den.lm.cases##
## Call:
## lm(formula = totcases.mil ~ pop.density, data = owid.countries)
##
## Coefficients:
## (Intercept) pop.density
## 1231.377 0.152
summary.lm(pop.den.lm.cases) #Summary statistics for linear fit. Not a significant relationship. ##
## Call:
## lm(formula = totcases.mil ~ pop.density, data = owid.countries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2157.1 -1177.8 -994.2 130.6 18487.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1231.3772 195.4280 6.301 2.21e-09 ***
## pop.density 0.1520 0.1176 1.293 0.198
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2578 on 180 degrees of freedom
## (4 observations deleted due to missingness)
## Multiple R-squared: 0.009202, Adjusted R-squared: 0.003697
## F-statistic: 1.672 on 1 and 180 DF, p-value: 0.1977
cases.pop <- function(x) {
owid.countries %>%
filter(continent == x) %>%
ggplot(aes(x = population, y = totcases.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "green") +
theme_bw() +
labs(x = "Population Size", y = "Total COVID-19 Cases per Million People") +
labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x), " vs. Population Size"))
}
pop.lm.cases.cont <- function(x) {
owid.countries %>%
filter(continent == x) %>%
lm(totcases.mil ~ population,.) %>%
summary.lm() #Function for linear regression statistics to be included in for loop below.
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(cases.pop(i))
print(pop.lm.cases.cont(i))
} #Loops function for continents of interest. ##
## Call:
## lm(formula = totcases.mil ~ population, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1497.85 -345.09 -175.71 -90.24 2662.88
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.970e+02 1.820e+02 2.182 0.0406 *
## population 1.399e-05 2.434e-06 5.749 1.05e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 820.1 on 21 degrees of freedom
## Multiple R-squared: 0.6115, Adjusted R-squared: 0.593
## F-statistic: 33.05 on 1 and 21 DF, p-value: 1.047e-05
##
## Call:
## lm(formula = totcases.mil ~ population, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2695.4 -2216.3 -1191.6 828.5 16764.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.007e+03 6.407e+02 4.694 2.63e-05 ***
## population -6.117e-06 1.972e-05 -0.310 0.758
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3758 on 44 degrees of freedom
## Multiple R-squared: 0.002183, Adjusted R-squared: -0.0205
## F-statistic: 0.09624 on 1 and 44 DF, p-value: 0.7579
##
## Call:
## lm(formula = totcases.mil ~ population, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1496.4 -1408.7 -1201.6 507.7 18248.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.509e+03 4.997e+02 3.020 0.00415 **
## population -1.384e-06 1.671e-06 -0.828 0.41190
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3236 on 45 degrees of freedom
## Multiple R-squared: 0.01502, Adjusted R-squared: -0.006873
## F-statistic: 0.686 on 1 and 45 DF, p-value: 0.4119
##
## Call:
## lm(formula = totcases.mil ~ population, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -321.11 -232.55 -199.00 -14.54 3069.34
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.283e+02 9.379e+01 3.500 0.000964 ***
## population -2.899e-06 2.168e-06 -1.337 0.187074
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 564.8 on 52 degrees of freedom
## Multiple R-squared: 0.03323, Adjusted R-squared: 0.01464
## F-statistic: 1.787 on 1 and 52 DF, p-value: 0.1871
##
## Call:
## lm(formula = totcases.mil ~ population, data = .)
##
## Residuals:
## 1 2 3 4
## 16.16 -67.22 196.03 -144.97
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.078e+01 1.313e+02 0.615 0.601
## population 7.275e-06 9.564e-06 0.761 0.526
##
## Residual standard error: 179.2 on 2 degrees of freedom
## Multiple R-squared: 0.2243, Adjusted R-squared: -0.1635
## F-statistic: 0.5785 on 1 and 2 DF, p-value: 0.5263
##
## Call:
## lm(formula = totcases.mil ~ population, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1446.1 -1227.4 -1084.0 11.2 5023.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.319e+03 7.569e+02 1.742 0.112
## population 6.354e-06 1.143e-05 0.556 0.591
##
## Residual standard error: 2204 on 10 degrees of freedom
## Multiple R-squared: 0.02996, Adjusted R-squared: -0.06705
## F-statistic: 0.3088 on 1 and 10 DF, p-value: 0.5906
cases.pop.den <- function(x) {
owid.countries %>%
filter(continent == x) %>%
ggplot(aes(x = pop.density, y = totcases.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "green") +
theme_bw() +
labs(x = "Population Density", y = "Total COVID-19 Cases per Million People") +
labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x), " vs. Population Density"))
}
pop.den.lm.cases.cont <- function(x) {
owid.countries %>%
filter(continent == x) %>%
lm(totcases.mil ~ pop.density,.) %>%
summary.lm() #Function for linear regression statistics to be included in for loop below.
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(cases.pop.den(i))
print(pop.den.lm.cases.cont(i))
} #Loops function for continents of interest. ##
## Call:
## lm(formula = totcases.mil ~ pop.density, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1199.9 -668.7 -356.4 -7.5 4244.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1292.670 405.934 3.184 0.00446 **
## pop.density -2.893 1.689 -1.713 0.10149
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1232 on 21 degrees of freedom
## Multiple R-squared: 0.1226, Adjusted R-squared: 0.08078
## F-statistic: 2.933 on 1 and 21 DF, p-value: 0.1015
##
## Call:
## lm(formula = totcases.mil ~ pop.density, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2356.3 -2006.6 -957.2 929.5 17129.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.633e+03 5.072e+02 5.191 5.4e-06 ***
## pop.density 1.617e-02 1.751e-01 0.092 0.927
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3332 on 43 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.0001984, Adjusted R-squared: -0.02305
## F-statistic: 0.008531 on 1 and 43 DF, p-value: 0.9268
##
## Call:
## lm(formula = totcases.mil ~ pop.density, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3520.3 -1236.9 -1147.3 556.9 18445.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1229.2082 523.4504 2.348 0.0235 *
## pop.density 0.3460 0.3177 1.089 0.2822
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3277 on 43 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.02684, Adjusted R-squared: 0.004206
## F-statistic: 1.186 on 1 and 43 DF, p-value: 0.2822
##
## Call:
## lm(formula = totcases.mil ~ pop.density, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -266.99 -236.07 -181.56 -44.54 3127.32
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 272.6243 102.1690 2.668 0.0102 *
## pop.density -0.1263 0.6192 -0.204 0.8392
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 579.3 on 51 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.000815, Adjusted R-squared: -0.01878
## F-statistic: 0.0416 on 1 and 51 DF, p-value: 0.8392
##
## Call:
## lm(formula = totcases.mil ~ pop.density, data = .)
##
## Residuals:
## 1 2 3 4
## 26.13 12.58 136.11 -174.81
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 273.491 130.823 2.091 0.172
## pop.density -5.367 4.677 -1.148 0.370
##
## Residual standard error: 158 on 2 degrees of freedom
## Multiple R-squared: 0.397, Adjusted R-squared: 0.09555
## F-statistic: 1.317 on 1 and 2 DF, p-value: 0.3699
##
## Call:
## lm(formula = totcases.mil ~ pop.density, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1791.1 -1201.3 -903.1 -36.4 4919.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 934.07 1096.78 0.852 0.414
## pop.density 25.10 36.77 0.683 0.510
##
## Residual standard error: 2187 on 10 degrees of freedom
## Multiple R-squared: 0.04454, Adjusted R-squared: -0.051
## F-statistic: 0.4662 on 1 and 10 DF, p-value: 0.5103
Population size and population density are evaluated below as confounding variables for COVID-19 deaths. These variables are first analyzed on a full-world scale, and then by continent.
ggplot(data = owid.countries, aes(x = population, y = totdeaths.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "green") +
theme_bw() +
labs(x = "Population Size", y = "Total COVID-19 Deaths per Million People") +
labs(title = ("World COVID-19 Deaths per Million People in vs. Population Size")) #COVID-19 deaths and population size on world scale. Red line is smooth fit; green line is linear fit. cor(owid.countries$totdeaths.mil, owid.countries$population, use = "complete.obs") ## [1] -0.01846469
pop.lm.deaths <- lm(totdeaths.mil ~ population, data = owid.countries)
pop.lm.deaths##
## Call:
## lm(formula = totdeaths.mil ~ population, data = owid.countries)
##
## Coefficients:
## (Intercept) population
## 6.043e+01 -1.815e-08
summary.lm(pop.lm.deaths) #Summary statistics for linear fit. Not a significant relationship. ##
## Call:
## lm(formula = totdeaths.mil ~ population, data = owid.countries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.13 -57.43 -51.85 -26.37 1177.12
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.043e+01 1.259e+01 4.800 3.53e-06 ***
## population -1.815e-08 7.652e-08 -0.237 0.813
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 156.4 on 165 degrees of freedom
## (19 observations deleted due to missingness)
## Multiple R-squared: 0.0003409, Adjusted R-squared: -0.005718
## F-statistic: 0.05628 on 1 and 165 DF, p-value: 0.8128
ggplot(data = owid.countries, aes(x = pop.density, y = totdeaths.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "green") +
theme_bw() +
labs(x = "Population Density", y = "Total COVID-19 Deaths per Million People") +
labs(title = ("World COVID-19 Deaths per Million People in vs. Population Density")) #COVID-19 deaths and population size on world scale. Red line is smooth fit; green line is linear fit. cor(owid.countries$totdeaths.mil, owid.countries$pop.density, use = "complete.obs") ## [1] 0.01069476
pop.den.lm.deaths <- lm(totdeaths.mil ~ pop.density, data = owid.countries)
pop.den.lm.deaths##
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = owid.countries)
##
## Coefficients:
## (Intercept) pop.density
## 6.033e+01 9.799e-04
summary.lm(pop.den.lm.deaths) #Summary statistics for linear fit. Not a significant relationship. ##
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = owid.countries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -66.69 -57.74 -53.14 -26.85 1176.68
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.033e+01 1.260e+01 4.789 3.76e-06 ***
## pop.density 9.799e-04 7.199e-03 0.136 0.892
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 157.6 on 162 degrees of freedom
## (22 observations deleted due to missingness)
## Multiple R-squared: 0.0001144, Adjusted R-squared: -0.006058
## F-statistic: 0.01853 on 1 and 162 DF, p-value: 0.8919
deaths.pop <- function(x) {
owid.countries %>%
filter(continent == x) %>%
ggplot(aes(x = population, y = totdeaths.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
theme_bw() +
labs(x = "Population Size", y = "Total COVID-19 Deaths per Million People") +
labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x), " vs. Population Size"))
}
pop.lm.deaths.cont <- function(x) {
owid.countries %>%
filter(continent == x) %>%
lm(totdeaths.mil ~ population,.) %>%
summary.lm() #Function for linear regression statistics to be included in for loop below.
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(deaths.pop(i))
print(pop.lm.deaths.cont(i))
} #Loops function for continents of interest. ##
## Call:
## lm(formula = totdeaths.mil ~ population, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58.65 -22.03 -15.34 8.50 155.68
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.990e+01 1.203e+01 1.654 0.118
## population 8.978e-07 1.424e-07 6.306 1.04e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 47.09 on 16 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.7131, Adjusted R-squared: 0.6952
## F-statistic: 39.77 on 1 and 16 DF, p-value: 1.045e-05
##
## Call:
## lm(formula = totdeaths.mil ~ population, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -295.08 -139.13 -109.91 22.27 1084.80
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.527e+02 4.546e+01 3.359 0.00165 **
## population 1.196e-06 1.384e-06 0.864 0.39225
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 262.7 on 43 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.01707, Adjusted R-squared: -0.005785
## F-statistic: 0.7469 on 1 and 43 DF, p-value: 0.3923
##
## Call:
## lm(formula = totdeaths.mil ~ population, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.333 -9.482 -6.405 -1.058 81.553
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.171e+01 3.120e+00 3.753 0.000569 ***
## population -5.163e-09 9.759e-09 -0.529 0.599776
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.77 on 39 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.007125, Adjusted R-squared: -0.01833
## F-statistic: 0.2799 on 1 and 39 DF, p-value: 0.5998
##
## Call:
## lm(formula = totdeaths.mil ~ population, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.050 -4.417 -2.140 1.130 49.289
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.471e+00 1.514e+00 3.613 0.000734 ***
## population -2.837e-08 3.370e-08 -0.842 0.404077
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.608 on 47 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.01486, Adjusted R-squared: -0.006102
## F-statistic: 0.7089 on 1 and 47 DF, p-value: 0.4041
##
## Call:
## lm(formula = totdeaths.mil ~ population, data = .)
##
## Residuals:
## ALL 2 residuals are 0: no residual degrees of freedom!
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.684e+00 NaN NaN NaN
## population -2.529e-08 NaN NaN NaN
##
## Residual standard error: NaN on 0 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 1, Adjusted R-squared: NaN
## F-statistic: NaN on 1 and 0 DF, p-value: NA
##
## Call:
## lm(formula = totdeaths.mil ~ population, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46.024 -35.593 -22.643 3.199 149.180
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.238e+01 2.145e+01 1.509 0.162
## population 4.972e-07 3.240e-07 1.534 0.156
##
## Residual standard error: 62.45 on 10 degrees of freedom
## Multiple R-squared: 0.1906, Adjusted R-squared: 0.1096
## F-statistic: 2.354 on 1 and 10 DF, p-value: 0.1559
deaths.pop.den <- function(x) {
owid.countries %>%
filter(continent == x) %>%
ggplot(aes(x = pop.density, y = totdeaths.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "green") +
theme_bw() +
labs(x = "Population Density", y = "Total COVID-19 Deaths per Million People") +
labs(title = paste0("COVID-19 Deaths per Million People in ", (continent = x), " vs. Population Density"))
}
pop.den.lm.cases.cont <- function(x) {
owid.countries %>%
filter(continent == x) %>%
lm(totdeaths.mil ~ pop.density,.) %>%
summary.lm() #Function for linear regression statistics to be included in for loop below.
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(deaths.pop.den(i))
print(pop.den.lm.cases.cont(i))
} #Loops function for continents of interest. ##
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -71.354 -45.176 -22.267 7.791 252.849
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 79.2768 28.0887 2.822 0.0123 *
## pop.density -0.1761 0.1187 -1.483 0.1574
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 82.43 on 16 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.1209, Adjusted R-squared: 0.06593
## F-statistic: 2.2 on 1 and 16 DF, p-value: 0.1574
##
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -168.39 -148.77 -121.59 2.42 1064.84
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 173.722636 40.334921 4.307 9.42e-05 ***
## pop.density -0.001816 0.013921 -0.130 0.897
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 265 on 43 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.0003954, Adjusted R-squared: -0.02285
## F-statistic: 0.01701 on 1 and 43 DF, p-value: 0.8968
##
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.516 -9.817 -6.638 -0.171 80.153
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.754973 3.283937 3.884 0.00041 ***
## pop.density -0.001575 0.001856 -0.848 0.40173
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.99 on 37 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.01908, Adjusted R-squared: -0.007434
## F-statistic: 0.7196 on 1 and 37 DF, p-value: 0.4017
##
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.894 -4.129 -2.571 0.850 49.621
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.501368 1.599068 2.815 0.00716 **
## pop.density 0.002969 0.009547 0.311 0.75723
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.738 on 46 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.002098, Adjusted R-squared: -0.0196
## F-statistic: 0.0967 on 1 and 46 DF, p-value: 0.7572
##
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = .)
##
## Residuals:
## ALL 2 residuals are 0: no residual degrees of freedom!
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.92739 NaN NaN NaN
## pop.density 0.03486 NaN NaN NaN
##
## Residual standard error: NaN on 0 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 1, Adjusted R-squared: NaN
## F-statistic: NaN on 1 and 0 DF, p-value: NA
##
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -75.83 -32.95 1.20 19.14 86.27
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.4806 27.9420 -0.125 0.9033
## pop.density 2.2013 0.9367 2.350 0.0406 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 55.72 on 10 degrees of freedom
## Multiple R-squared: 0.3558, Adjusted R-squared: 0.2913
## F-statistic: 5.522 on 1 and 10 DF, p-value: 0.04064
First, a linear regression is performed on world COVID-19 cases and GDP. Then, scatter plots with a smooth line showing the relationship of GDP per capita and COVID-19 cases per million people are the outputs of the function below:
ggplot(data = owid.countries, aes(x = gdp.cap, y = totcases.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "green") +
theme_bw() +
labs(x = "GDP per Capita", y = "Total COVID-19 Cases per Million People") +
labs(title = paste0("World COVID-19 Cases per Million People in vs. GDP per Capita")) #COVID-19 cases and GDP per capita on world scale. Red line is smooth fit; green line is linear fit. cor(owid.countries$totcases.mil, owid.countries$gdp.cap, use = "complete.obs") ## [1] 0.6567062
gdp.lm.cases <- lm(totcases.mil ~ gdp.cap, data = owid.countries)
gdp.lm.cases##
## Call:
## lm(formula = totcases.mil ~ gdp.cap, data = owid.countries)
##
## Coefficients:
## (Intercept) gdp.cap
## -358.20891 0.08419
summary.lm(gdp.lm.cases) #Summary statistics for linear fit. ##
## Call:
## lm(formula = totcases.mil ~ gdp.cap, data = owid.countries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5365.0 -832.3 -60.2 312.2 15342.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.582e+02 1.985e+02 -1.804 0.0729 .
## gdp.cap 8.419e-02 7.288e-03 11.552 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1911 on 176 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.4313, Adjusted R-squared: 0.428
## F-statistic: 133.5 on 1 and 176 DF, p-value: < 2.2e-16
cases.gdp <- function(x) {
owid.countries %>%
filter(continent == x) %>%
ggplot(aes(x = gdp.cap, y = totcases.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "green") +
theme_bw() +
labs(x = "GDP per Capita", y = "Total COVID-19 Cases per Million People") +
labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x), " vs. GDP per Capita"))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(cases.gdp(i))
} #Loops function for continents of interest. First, a linear regression is performed on world COVID-19 deaths and GDP. Scatter plots with a smooth line showing the relationship of GDP per capita and COVID-19 deaths per million people are the outputs of the function below:
ggplot(data = owid.countries, aes(x = gdp.cap, y = totdeaths.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "green") +
theme_bw() +
labs(x = "GDP per Capita", y = "Total COVID-19 Deaths per Million People") +
labs(title = paste0("World COVID-19 Deaths per Million People in vs. GDP per Capita")) #COVID-19 deaths and GDP per capita on world scale. Red line is smooth fit; green line is linear fit. cor(owid.countries$totdeaths.mil, owid.countries$gdp.cap, use = "complete.obs") ## [1] 0.3780182
gdp.lm.deaths <- lm(totdeaths.mil ~ gdp.cap, data = owid.countries)
gdp.lm.deaths##
## Call:
## lm(formula = totdeaths.mil ~ gdp.cap, data = owid.countries)
##
## Coefficients:
## (Intercept) gdp.cap
## 1.167475 0.002812
summary.lm(gdp.lm.deaths) #Summary statistics for linear fit. ##
## Call:
## lm(formula = totdeaths.mil ~ gdp.cap, data = owid.countries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -316.82 -41.37 -15.05 -2.98 1076.48
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.167e+00 1.560e+01 0.075 0.94
## gdp.cap 2.812e-03 5.479e-04 5.132 8.3e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 140.9 on 158 degrees of freedom
## (26 observations deleted due to missingness)
## Multiple R-squared: 0.1429, Adjusted R-squared: 0.1375
## F-statistic: 26.34 on 1 and 158 DF, p-value: 8.298e-07
deaths.gdp <- function(x) {
owid.countries %>%
filter(continent == x) %>%
ggplot(aes(x = gdp.cap, y = totdeaths.mil)) +
geom_point(color = "#333aff") +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "green") +
theme_bw() +
labs(x = "GDP per Capita", y = "Total COVID-19 Deaths per Million People") +
labs(title = paste0("COVID-19 Deaths per Million People in ", (continent = x), " vs. GDP per Capita"))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(deaths.gdp(i))
} #Loops function for continents of interest. First, scatter plot shows all countries and the relationship between median age and COVID-19 outcomes. Next, median age is cut in two halves and converted to factors with labels “below.med” and “above.med”. A bar plot shows COVID-19 cases per million for each country separated by continent, with fill color indicating whether the median population age is above or below the global median.
ggplot(data = owid.countries, aes(x = med.age, y = totcases.mil)) +
geom_point(color = "#333aff", alpha = 1) +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "green") +
theme_bw() +
labs(x = "Median Age", y = "Total COVID-19 Cases per Million People") +
labs(title = "World COVID-19 Cases per Million People vs. Median Population Age") #COVID-19 cases vs. median age on world scale. cor(owid.countries$totcases.mil, owid.countries$med.age, use = "complete.obs")## [1] 0.326925
age.lm.cases <- lm(totcases.mil ~ med.age, data = owid.countries)
age.lm.cases##
## Call:
## lm(formula = totcases.mil ~ med.age, data = owid.countries)
##
## Coefficients:
## (Intercept) med.age
## -1161.02 74.91
summary.lm(age.lm.cases) #Summary statistics for linear fit. ##
## Call:
## lm(formula = totcases.mil ~ med.age, data = owid.countries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2317.4 -954.8 -304.1 25.1 18524.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1161.02 518.02 -2.241 0.0263 *
## med.age 74.91 16.32 4.589 8.43e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1998 on 176 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.1069, Adjusted R-squared: 0.1018
## F-statistic: 21.06 on 1 and 176 DF, p-value: 8.428e-06
exp.model.cases <- lm(log(totcases.mil) ~ med.age, data = owid.countries)
exp.model.cases##
## Call:
## lm(formula = log(totcases.mil) ~ med.age, data = owid.countries)
##
## Coefficients:
## (Intercept) med.age
## 1.6876 0.1259
summary.lm(exp.model.cases)##
## Call:
## lm(formula = log(totcases.mil) ~ med.age, data = owid.countries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.6445 -1.0443 0.0571 1.1717 4.1880
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.68764 0.43260 3.901 0.000136 ***
## med.age 0.12587 0.01363 9.234 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.668 on 176 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.3263, Adjusted R-squared: 0.3225
## F-statistic: 85.26 on 1 and 176 DF, p-value: < 2.2e-16
summarise(owid.countries, med.age = mean(med.age, na.rm = TRUE)) #Mean of the median age of all countries.## med.age
## 1 30.37978
cases.med.age <- function(x) {
owid.countries %>%
mutate(med.age = cut(med.age, breaks = c(0, 30.38, 48.3), labels = c("below.med", "above.med"))) %>%
mutate(med.age <- as.factor(med.age)) %>%
filter(continent == x) %>%
ggplot(aes(x = location, y = totcases.mil, fill = factor(med.age))) +
geom_bar(stat = "identity", position = "dodge") +
theme_bw() +
labs(x = "Country", y = "Total COVID-19 Cases per Million People") +
scale_fill_brewer(palette = "Set1") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = paste0("COVID-19 Cases per Million People by Country in ", (continent = x), " and Median Age"))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(cases.med.age(i))
} #Loops function for continents of interest.First, scatter plot shows all countries and the relationship between median age and COVID-19 deaths. Next, median age is cut in two halves and converted to factors with labels “below.med” and “above.med”. A bar plot shows COVID-19 daeths per million for each country separated by continent, with fill color indicating whether the median population age is above or below the global median.
ggplot(data = owid.countries, aes(x = med.age, y = totdeaths.mil)) +
geom_point(color = "#333aff", alpha = 1) +
geom_smooth(color = "red") +
geom_smooth(method = "lm", color = "green") +
theme_bw() +
labs(x = "Median Age", y = "Total COVID-19 Deaths per Million People") +
labs(title = "World COVID-19 Deaths per Million People vs. Median Population Age") #COVID-19 deaths vs. median age on world scale. cor(owid.countries$totdeaths.mil, owid.countries$med.age, use = "complete.obs")## [1] 0.3983249
age.lm.deaths <- lm(totdeaths.mil ~ med.age, data = owid.countries)
age.lm.deaths##
## Call:
## lm(formula = totdeaths.mil ~ med.age, data = owid.countries)
##
## Coefficients:
## (Intercept) med.age
## -106.27 5.04
summary.lm(age.lm.deaths) #Summary statistics for linear fit. ##
## Call:
## lm(formula = totdeaths.mil ~ med.age, data = owid.countries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -129.54 -51.45 -15.42 13.50 712.47
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -106.2735 29.5129 -3.601 0.000423 ***
## med.age 5.0396 0.9175 5.493 1.53e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 109.1 on 160 degrees of freedom
## (24 observations deleted due to missingness)
## Multiple R-squared: 0.1587, Adjusted R-squared: 0.1534
## F-statistic: 30.17 on 1 and 160 DF, p-value: 1.525e-07
exp.model.deaths <- lm(log(totdeaths.mil) ~ med.age, data = owid.countries)
exp.model.deaths##
## Call:
## lm(formula = log(totdeaths.mil) ~ med.age, data = owid.countries)
##
## Coefficients:
## (Intercept) med.age
## -2.1931 0.1377
summary.lm(exp.model.deaths)##
## Call:
## lm(formula = log(totdeaths.mil) ~ med.age, data = owid.countries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.8420 -1.0420 0.1639 1.0674 3.6210
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.19314 0.44359 -4.944 1.92e-06 ***
## med.age 0.13770 0.01379 9.986 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.64 on 160 degrees of freedom
## (24 observations deleted due to missingness)
## Multiple R-squared: 0.3839, Adjusted R-squared: 0.3801
## F-statistic: 99.71 on 1 and 160 DF, p-value: < 2.2e-16
summarise(owid.countries, med.age = mean(med.age, na.rm = TRUE)) #Mean of the median age of all countries.## med.age
## 1 30.37978
deaths.med.age <- function(x) {
owid.countries %>%
mutate(med.age = cut(med.age, breaks = c(0, 30.38, 48.3), labels = c("below.med", "above.med"))) %>%
mutate(med.age <- as.factor(med.age)) %>%
filter(continent == x) %>%
ggplot(aes(x = location, y = totdeaths.mil, fill = factor(med.age))) +
geom_bar(stat = "identity", position = "dodge") +
theme_bw() +
labs(x = "Country", y = "Total COVID-19 Deaths per Million People") +
scale_fill_brewer(palette = "Set1") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = paste0("COVID-19 Deaths per Million People by Country in ", (continent = x), " and Median Age"))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(deaths.med.age(i))
} #Loops function for continents of interest.Graphs below show percentages of female and male smokers by country and the correlation to COVID-19 cases.
ggplot(data = owid.countries, aes(x = fem.smokers, y = totcases.mil)) +
geom_point(color = "#333aff", alpha = 1) +
geom_smooth(color = "red") +
theme_bw() +
labs(x = "Percentage of Female Smokers by Country", y = "Total COVID-19 Cases per Million People") +
labs(title = "World COVID-19 Cases per Million People vs. Percentage of Female Smokers") #COVID-19 cases vs. female smokers on world scale. cor(owid.countries$totcases.mil, owid.countries$fem.smokers, use = "complete.obs") ## [1] 0.2013998
ggplot(data = owid.countries, aes(x = male.smokers, y = totcases.mil)) +
geom_point(color = "#333aff", alpha = 1) +
geom_smooth(color = "red") +
theme_bw() +
labs(x = "Percentage of Male Smokers by Country", y = "Total COVID-19 Cases per Million People") +
labs(title = "World COVID-19 Cases per Million People vs. Percentage of Male Smokers") #COVID-19 cases vs. male smokers on world scale. cor(owid.countries$totcases.mil, owid.countries$male.smokers, use = "complete.obs") ## [1] -0.06492578
cases.smoking.f <- function(x) {
owid.countries %>%
filter(continent == x) %>%
ggplot(aes(x = fem.smokers, y = totcases.mil)) +
geom_point(color = "#333aff", alpha = 1) +
geom_smooth(color = "red") +
theme_bw() +
labs(x = "Percentage of Female Smokers in Country", y = "Total COVID-19 Cases per Million People") +
labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x), " vs. Percentage of Female Smokers"))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(cases.smoking.f(i)) #Apply function to each continent.
}cases.smoking.m <- function(x) {
owid.countries %>%
filter(continent == x) %>%
ggplot(aes(x = male.smokers, y = totcases.mil)) +
geom_point(color = "#333aff", alpha = 1) +
geom_smooth(color = "red") +
theme_bw() +
labs(x = "Percentage of Male Smokers in Country", y = "Total COVID-19 Cases per Million People") +
labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x), " vs. Percentage of Male Smokers"))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(cases.smoking.m(i)) #Apply function to each continent.
}Graphs below show percentages of female and male smokers in the world and by country and the correlation to COVID-19 deaths.
ggplot(data = owid.countries, aes(x = fem.smokers, y = totdeaths.mil)) +
geom_point(color = "#333aff", alpha = 1) +
geom_smooth(color = "red") +
theme_bw() +
labs(x = "Percentage of Female Smokers by Country", y = "Total COVID-19 Deaths per Million People") +
labs(title = "World COVID-19 Deaths per Million People vs. Percentage of Female Smokers") #COVID-19 deaths vs. female smokers on world scale. cor(owid.countries$totdeaths.mil, owid.countries$fem.smokers, use = "complete.obs") ## [1] 0.4070555
ggplot(data = owid.countries, aes(x = male.smokers, y = totdeaths.mil)) +
geom_point(color = "#333aff", alpha = 1) +
geom_smooth(color = "red") +
theme_bw() +
labs(x = "Percentage of Male Smokers by Country", y = "Total COVID-19 Deaths per Million People") +
labs(title = "World COVID-19 Deaths per Million People vs. Percentage of Male Smokers") #COVID-19 deaths vs. male smokers on world scale. cor(owid.countries$totdeaths.mil, owid.countries$male.smokers, use = "complete.obs") ## [1] -0.09632508
deaths.smoking.f <- function(x) {
owid.countries %>%
filter(continent == x) %>%
ggplot(aes(x = fem.smokers, y = totdeaths.mil)) +
geom_point(color = "#333aff", alpha = 1) +
geom_smooth(color = "red") +
theme_bw() +
labs(x = "Percentage of Female Smokers in Country", y = "Total COVID-19 Deaths per Million People") +
labs(title = paste0("COVID-19 Deaths per Million People in ", (continent = x), " vs. Percentage of Female Smokers"))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(deaths.smoking.f(i)) #Apply function to each continent.
}deaths.smoking.m <- function(x) {
owid.countries %>%
filter(continent == x) %>%
ggplot(aes(x = male.smokers, y = totdeaths.mil)) +
geom_point(color = "#333aff", alpha = 1) +
geom_smooth(color = "red") +
theme_bw() +
labs(x = "Percentage of Male Smokers in Country", y = "Total COVID-19 Deaths per Million People") +
labs(title = paste0("COVID-19 Deaths per Million People in ", (continent = x), " vs. Percentage of Male Smokers"))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
print(deaths.smoking.m(i)) #Apply function to each continent.
}World map file is read in and prepared for overlaying with COVID-19 data.
library(sf)
#File downloaded from https://hub.arcgis.com/datasets/esri::world-countries-generalized-1/explore and moved to working directory.
world.map.zip <- "/Users/alanaschreibman/NO2_COVID-19.zip"
if (file.exists("World_Countries_.zip")) file.remove("World_Countries_.zip")
world.sf <- st_read("country_gen_trim.shp") # Read shapefile as an sf object## Reading layer `country_gen_trim' from data source `/Users/alanaschreibman/NO2_COVID-19/country_gen_trim.shp' using driver `ESRI Shapefile'
## Simple feature collection with 256 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -180 ymin: -89 xmax: 180 ymax: 83.6236
## Geodetic CRS: WGS 84
class(world.sf)## [1] "sf" "data.frame"
head(world.sf)## Simple feature collection with 6 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -176.6431 ymin: -21.94084 xmax: -138.8098 ymax: 0.808609
## Geodetic CRS: WGS 84
## OBJECTID FIPS_CNTRY ISO_2DIGIT ISO_3DIGIT NAME COUNTRYAFF
## 1 1 AQ AS ASM American Samoa United States
## 2 2 WQ UM UMI Baker United States
## 3 3 CW CK COK Cook Islands New Zealand
## 4 4 FP PF PYF French Polynesia France
## 5 5 WQ UM UMI Howland United States
## 6 6 WQ UM UMI Jarvis United States
## CONTINENT SHAPE_Leng SHAPE_Area geometry
## 1 Oceania 0.60012436 1.371972e-02 MULTIPOLYGON (((-170.7439 -...
## 2 Oceania 0.02887532 3.430265e-05 MULTIPOLYGON (((-176.4614 0...
## 3 Oceania 0.98066416 1.307346e-02 MULTIPOLYGON (((-159.747 -2...
## 4 Oceania 3.93021062 1.753321e-01 MULTIPOLYGON (((-149.1792 -...
## 5 Oceania 0.05303007 1.811934e-04 MULTIPOLYGON (((-176.6362 0...
## 6 Oceania 0.09758213 6.658148e-04 MULTIPOLYGON (((-160.0211 -...
world.geo <- st_geometry(world.sf)
plot(world.geo)plot(world.geo, col = "lemonchiffon2")plot(world.geo, lwd = 0.5, border = "red")Joins OWID data set with world map data frame and then use country cases to shade countries accordingly.
library(RColorBrewer)
library(viridis)
my_theme <- function() {
theme_minimal() +
theme(axis.line = element_blank(),
plot.title = element_text(size=30),
axis.text = element_blank(),
axis.title = element_blank(),
panel.grid = element_line(color = "white"),
legend.key.size = unit(0.8, "cm"),
legend.text = element_text(size = 20),
legend.title = element_text(size = 20))
}
myPalette <- colorRampPalette(brewer.pal(9, "BuPu"))
world.sf[105, "NAME"] <- "Democratic Republic of Congo"
world.sf <- world.sf %>%
rename(location = NAME) %>%
inner_join(owid.countries, world.sf, by = "location") #Joined OWID data set with .sph file for geospatial analysis.
ggplot(data = world.sf, aes(fill = totcases.mil),
lwd = 0.05) +
geom_sf() +
my_theme() +
labs(title = "Population-Adjusted Distribution of COVID-19 Cases") +
scale_fill_gradientn(name = "COVID-19 \ncases (per million)",
limits = c(0, 6500),
colours = myPalette(100)) #Plot of world distribution of COVID-19 cases with color shading. Uses merged data frame and shades deaths by country accordingly.
ggplot(data = world.sf, aes(fill = totdeaths.mil),
lwd = 0.05) +
geom_sf() +
my_theme() +
labs(title = "Population-Adjusted Distribution of COVID-19 Deaths") +
scale_fill_gradientn(name = "COVID-19 \ndeaths (per million)",
limits = c(0, 700),
colours = myPalette(100)) #Plot of world distribution of COVID-19 deaths with color shading. Uses merged data frame and shades GDP per capita by country accordingly.
ggplot(data = world.sf, aes(fill = gdp.cap),
lwd = 0.05) +
geom_sf() +
my_theme() +
labs(title = "Global Distribution of GDP per Capita") +
scale_fill_gradientn(name = "GDP \nper capita",
limits = c(0, 60000),
colours = myPalette(100)) Uses merged data frame and shades population density by country accordingly.
ggplot(data = world.sf, aes(fill = pop.density),
lwd = 0.05) +
geom_sf() +
my_theme() +
labs(title = "Global Distribution of Population Density") +
scale_fill_gradientn(name = "Population \ndensity",
limits = c(0, 450),
colours = myPalette(100)) #Plot of world distribution of COVID-19 deaths with color shading. Uses merged data frame and shades median age by country accordingly.
ggplot(data = world.sf, aes(fill = med.age),
lwd = 0.05) +
geom_sf() +
my_theme() +
labs(title = "Global Distribution of Median Age") +
scale_fill_gradientn(name = "Median \n age",
limits = c(0, 50),
colours = myPalette(100)) Uses merged data frame and shades male and female smoking percentages by country accordingly.
ggplot(data = world.sf, aes(fill = fem.smokers),
lwd = 0.05) +
geom_sf() +
my_theme() +
labs(title = "Percentage of Female Smokers by Country") +
scale_fill_gradientn(name = "Female \nsmokers (%)",
limits = c(0, 40),
colours = myPalette(100)) ggplot(data = world.sf, aes(fill = male.smokers),
lwd = 0.05) +
geom_sf() +
my_theme() +
labs(title = "Percentage of Male Smokers by Country") +
scale_fill_gradientn(name = "Male \nsmokers (%)",
limits = c(0, 80),
colours = myPalette(100)) Uses merged data frame and shades diabetes prevalence by country accordingly.
ggplot(data = world.sf, aes(fill = diab.prev),
lwd = 0.05) +
geom_sf() +
my_theme() +
labs(title = "Diabetes Prevalence by Country") +
scale_fill_gradientn(name = "Diabetes \nprevalence (%)",
limits = c(0, 20),
colours = myPalette(100)) Uses merged data frame and shades indicence of cardiovascular death by country accordingly.
ggplot(data = world.sf, aes(fill = card.death),
lwd = 0.05) +
geom_sf() +
my_theme() +
labs(title = "Incidence of Cardiovascular Death by Country") +
scale_fill_gradientn(name = "Diabetes \nprevalence (%)",
limits = c(0, 800),
colours = myPalette(100))